package com.free4inno.kmstika.utils;

import org.apache.tika.Tika;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;


/**
 * Author HUYUZHU.
 * Date 2021/3/26 11:31.
 */

public class TikaUtils {

    /**
     * Fetches the file behind {@code url} and extracts its text content, letting
     * Tika auto-detect the file type (uses the {@code Parser} interface).
     *
     * <p>Failures are reported through the return value rather than by throwing:
     * the stack trace is printed and a fixed error string is returned.
     *
     * @param url URL of the file stored on the server
     * @return the extracted text; {@code "Error reading file"} if the URL is invalid
     *         or the stream cannot be opened; {@code "Error parsing file"} if parsing
     *         (or closing the stream) fails
     */
    public static String parseFile(String url) {
        // Parser that detects the document type automatically.
        AutoDetectParser autoDetectParser = new AutoDetectParser();
        // Content handler constructed with a raised write limit (10 * 1024 * 1024).
        BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
        // Receives the metadata produced during parsing.
        Metadata metadata = new Metadata();
        // Context object handed to the parser.
        ParseContext pcontext = new ParseContext();
        String parseResult = "";
        try {
            // Connect and obtain the stream of the file located on the server.
            // Opened outside the try-with-resources below so that an open failure
            // is reported as a read error, not a parse error.
            URL fileUrl = new URL(url);
            URLConnection conn = fileUrl.openConnection();
            InputStream remoteStream = conn.getInputStream();
            // try-with-resources guarantees the stream is closed even when parse()
            // throws; previously the stream leaked on every parse failure.
            try (InputStream inputStream = remoteStream) {
                autoDetectParser.parse(inputStream, handler, metadata, pcontext);
                parseResult = handler.toString();
            } catch (Exception e) {
                e.printStackTrace();
                parseResult = "Error parsing file";
            }
        } catch (Exception e) {
            e.printStackTrace();
            parseResult = "Error reading file";
        }
        return parseResult;
    }

    /**
     * Extracts the text content of a file using the {@code Tika} facade, which
     * auto-detects the file type.
     *
     * @param file the file to extract text from
     * @return the extracted text, or an empty string if extraction fails
     *         (the stack trace is printed in that case)
     */
    public static String getContext(File file) {
        try {
            Tika tika = new Tika();
            return tika.parseToString(file);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Detects the language of a file's text content.
     *
     * <p>The file is parsed with an auto-detecting parser and the extracted text is
     * passed to {@code LanguageIdentifier}.
     *
     * @param file the file whose language should be detected
     * @return the language reported by {@code LanguageIdentifier.getLanguage()},
     *         or an empty string if reading or parsing fails (the stack trace is
     *         printed in that case)
     */
    public static String languageDetection(File file) {
        Parser parser = new AutoDetectParser();
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        // try-with-resources closes the file stream on every path; the original
        // never closed it, leaking a file handle per call.
        try (FileInputStream content = new FileInputStream(file)) {
            parser.parse(content, handler, metadata, new ParseContext());
            LanguageIdentifier object = new LanguageIdentifier(handler.toString());
            return object.getLanguage();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
    }

}
